library(tidyverse)
## ── Attaching packages ──────────
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.3.4     ✔ dplyr   0.7.4
## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## ── Conflicts ───────────────────
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(janitor)
library(stringr)
library(forcats)
library(viridis)
## Loading required package: viridisLite
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Read the zipped Instacart training CSV and pass it through
## janitor::clean_names() so the column names are in a consistent format.
instacart <- clean_names(read_csv("./instacart_train_data.csv.zip"))
## Warning in strptime(x, format, tz = tz): unknown timezone 'zone/tz/2017c.
## 1.0/zoneinfo/America/New_York'
## Parsed with column specification:
## cols(
##   order_id = col_integer(),
##   product_id = col_integer(),
##   add_to_cart_order = col_integer(),
##   reordered = col_integer(),
##   user_id = col_integer(),
##   eval_set = col_character(),
##   order_number = col_integer(),
##   order_dow = col_integer(),
##   order_hour_of_day = col_integer(),
##   days_since_prior_order = col_integer(),
##   product_name = col_character(),
##   aisle_id = col_integer(),
##   department_id = col_integer(),
##   aisle = col_character(),
##   department = col_character()
## )

Column

Barplot - Item counts by department

How many items were ordered in each department?

## Bar chart of the number of rows (items) per department, with the
## department factor levels ordered by that count.
instacart %>%
  count(department) %>%
  transmute(
    department = fct_reorder(department, n),
    items = n
  ) %>%
  plot_ly(
    x = ~department,
    y = ~items,
    color = ~department,
    type = "bar"
  )
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

Column

Boxplot - Order hour of the day

What is the distribution of the order hour of the day for each department? It seems that most orders are placed between 8:00 and 18:00.

## Box plot of order_hour_of_day for each department. The department
## levels are reordered by the IQR of order_hour_of_day, in increasing order.
instacart %>%
  select(department, order_hour_of_day) %>%
  mutate(department = fct_reorder(department, order_hour_of_day, IQR)) %>%
  plot_ly(
    y = ~order_hour_of_day,
    color = ~department,
    type = "box",
    colors = "Set2"
  )

Scatterplot - Product reorder ratio

Products with a high number of orders are naturally more likely to be reordered. However, there seems to be a ceiling effect.

## Per-product count of rows flagged as a reorder (reordered == 1).
## The result has one row per product_id with the count in `reordered`.
reordered <- instacart %>%
  filter(reordered == 1) %>%
  group_by(product_id) %>%
  summarise(reordered = n())

## Scatter plot of each product's reorder ratio against how often it was
## ordered.
##   ordered   - number of rows per (product_id, department)
##   reordered - number of those rows flagged as reorders (0 when the product
##               has no match in the `reordered` table)
##   reo_ratio - reordered / ordered
## The join key is given explicitly so the join does not depend on which
## column names the two tables happen to share (and emits no "Joining, by"
## message).
instacart %>%
  count(product_id, department) %>%
  rename(ordered = n) %>%
  left_join(reordered, by = "product_id") %>%
  mutate(
    # Products absent from `reordered` come back as NA after the join.
    reordered = coalesce(reordered, 0L),
    reo_ratio = reordered / ordered,
    # Hover label: the product id appears once, followed by the department.
    tlabel = str_c(
      "Product_ID: ", product_id,
      "\nDepartment: ", department
    )
  ) %>%
  plot_ly(
    x = ~ordered,
    y = ~reo_ratio,
    type = "scatter",
    mode = "markers",
    alpha = 0.5,
    text = ~tlabel,
    color = I("black")
  )